package edu.berkeley.nlp.PCFGLA;

import edu.berkeley.nlp.PCFGLA.smoothing.Smoother;
import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.util.ArrayUtil;
import fig.basic.Indexer;
import edu.berkeley.nlp.util.Numberer;
import edu.berkeley.nlp.util.ScalingTools;

import java.io.Serializable;
import java.util.*;

/**
 * Simple default implementation of a lexicon, 
 * which scores word, tag pairs with P(word|tag)
 *
 * instead of dealing with words as strings we will map them to integers
 * with a wordIndexer. to further simplify things each tag will have 
 * its own mapping from wordIndex to a tag-specific index so that we
 * don't have to deal with unobserved events
 * 
 * assumes that rare words have been replaced with some unknown word token
 */
public class SimpleLexicon implements java.io.Serializable, Lexicon {
	public IntegerIndexer[] tagWordIndexer; // per tag: global word index -> tag-specific word index
	public double[][][] expectedCounts; // indexed by tag, substate, word 
	public double[][][] scores; // indexed by tag, substate, word (normalized in optimize())
	
	public int[] wordCounter; // how many times each word occurred in training
//	public boolean[] wordIsAmbiguous;
	
  /** A trick to allow loading of saved Lexicons even if the version has changed. */
  private static final long serialVersionUID = 2L;
  /** The number of substates for each state */
  public short[] numSubStates;
  int numStates; // == numSubStates.length
  int nWords;    // size of wordIndexer after init()
  
  double threshold; // pruning threshold, exposed via getPruningThreshold()
  boolean isLogarithmMode; // true once scores have been converted to log space
  boolean useVarDP = false; // NOTE(review): never read within this class — confirm external users
  
  public Indexer<String> wordIndexer; // global word/signature string -> integer index
  Smoother smoother; // optional; applied across substates in optimize() and score()
  
  // additions from the stanford parser which are needed for a better 
  // unknown word model...
  /**
   * We cache the last signature looked up, because it asks for the same one
   * many times when an unknown word is encountered!  (Note that under the
   * current scheme, one unknown word, if seen sentence-initially and
   * non-initially, will be parsed with two different signatures....)
   */
  protected transient String lastSignature = "";
  protected transient int lastSentencePosition = -1;
  protected transient String lastWordToSignaturize = "";
  private int unknownLevel = 5; //different modes for unknown words, 5 is english specific

  /**
   * Turns the accumulated expected counts into conditional probabilities
   * P(word | tag, substate) by normalizing each substate's count row, then
   * (if a smoother is installed) smooths each word's distribution over the
   * substates of its tag, in place.
   */
  public void optimize() {
  	for (int tag = 0; tag < expectedCounts.length; tag++) {
  		for (int substate = 0; substate < numSubStates[tag]; substate++) {
  			double[] counts = expectedCounts[tag][substate];
  			double mass = ArrayUtil.sum(counts);
  			// a substate with zero total mass keeps all-zero scores
  			double normalizer = (mass == 0) ? 0 : 1.0 / mass;
  			double[] row = scores[tag][substate];
  			for (int word = 0; word < counts.length; word++) {
  				row[word] = counts[word] * normalizer;
  			}
  		}
  	}

  	// smooth each word's substate distribution
  	if (smoother != null) {
  		for (short tag = 0; tag < expectedCounts.length; tag++) {
  			for (int word = 0; word < expectedCounts[tag][0].length; word++) {
  				double[] column = new double[numSubStates[tag]];
  				for (int substate = 0; substate < numSubStates[tag]; substate++) {
  					column[substate] = scores[tag][substate][word];
  				}
  				smoother.smooth(tag, column);
  				for (int substate = 0; substate < numSubStates[tag]; substate++) {
  					scores[tag][substate][word] = column[substate];
  				}
  			}
  		}
  	}
  }
  
  /** Create a blank Lexicon object.  Fill it by
   * calling tallyStateSetTree for each training tree, then
   * calling optimize().
   * 
   * NOTE(review): smoothingCutoff, smoothParam and smoother are accepted but
   * never stored or used by this constructor — the smoother has to be
   * installed separately via setSmoother(); confirm that callers do so.
   * 
   * @param numSubStates number of substates for each state
   */
  @SuppressWarnings("unchecked")
	public SimpleLexicon(short[] numSubStates, int smoothingCutoff, double[] smoothParam, 
			Smoother smoother, double threshold, StateSetTreeList trainTrees) {
  	this(numSubStates, threshold);
  	init(trainTrees);
  }


	/**
	 * Creates an empty lexicon; call init(...) before use.
	 *
	 * @param numSubStates number of substates for each state
	 * @param threshold    pruning threshold (see getPruningThreshold())
	 */
	public SimpleLexicon(short[] numSubStates, double threshold) {
  	this.numSubStates = numSubStates;
    this.threshold = threshold;
    this.wordIndexer = new Indexer<String>();
    this.numStates = numSubStates.length;
    this.isLogarithmMode = false;
    // Use the less English-specific unknown-word model for any non-WSJ corpus.
    // (The original condition also tested "== BROWN" after "!= WSJ", which is
    // redundant: BROWN already satisfies "!= WSJ".)
    if (Corpus.myTreebank != Corpus.TreeBankType.WSJ)
    	unknownLevel = 4;
  }

  /**
   * Convenience overload: scores a raw word by wrapping it in a temporary
   * one-token StateSet spanning [pos, pos+1) and delegating to the
   * StateSet-based score(...).
   */
  public double[] score(String word, short tag, int pos, boolean noSmoothing, boolean isSignature) {
  	StateSet wrapped = new StateSet(tag, (short) 1, word, (short) pos, (short) (pos + 1));
  	// -2 marks the indices as "not yet looked up" so the delegate computes them
  	wrapped.wordIndex = -2;
  	wrapped.sigIndex = -2;
  	return score(wrapped, tag, noSmoothing, isSignature);
  }
	
	/**
	 * Returns P(word | tag, substate) for every substate of the given tag.
	 * Word and signature indices are cached on the StateSet: -2 means "not yet
	 * computed" (computed and stored here), -1 means "unknown". If the word is
	 * unknown, its unknown-word signature is used instead.
	 * NOTE(review): the noSmoothing and isSignature parameters are currently
	 * ignored by this implementation.
	 */
	public double[] score(StateSet stateSet, short tag, boolean noSmoothing, boolean isSignature) {
		double[] res = new double[numSubStates[tag]];
		int globalWordIndex = stateSet.wordIndex;
		if (globalWordIndex==-2)
			globalWordIndex = stateSet.wordIndex = wordIndexer.indexOf(stateSet.getWord());
		if (globalWordIndex==-1)
			globalWordIndex = stateSet.sigIndex;
		if (globalWordIndex==-2) 
			globalWordIndex = stateSet.sigIndex = wordIndexer.indexOf(getSignature(stateSet.getWord(), stateSet.from));
		if (globalWordIndex==-1){
			// even the signature is unknown: fall back to a small constant score
			System.out.println("unknown signature for word "+stateSet.getWord());
			Arrays.fill(res, 0.001);
			return res;
		}

		int tagSpecificWordIndex = tagWordIndexer[tag].indexOf(globalWordIndex);
		if (tagSpecificWordIndex==-1){
			// (tag, word) never observed together: probability 0 (-inf in log mode)
			if (isLogarithmMode) Arrays.fill(res, Double.NEGATIVE_INFINITY);//-80??Double.NEGATIVE_INFINITY);
//			else Arrays.fill(res, 1e-80);
			return res;
		}
		for (int i=0; i<numSubStates[tag]; i++){
			res[i] = scores[tag][i][tagSpecificWordIndex]; 
		}
    if (smoother!=null) smoother.smooth(tag,res);
		return res;
  } 


  /**
   * Trains this lexicon on one StateSet tree, accumulating expected
   * (tag, substate, word) counts into expectedCounts.
   *
   * @param trainTree   tree with inside/outside scores filled in (required
   *                    when randomness == -1)
   * @param randomness  -1: EM — weight each count by the posterior probability
   *                    of the (tag, substate) over this word given the sentence;
   *                    0: weight 1 everywhere (baseline);
   *                    &gt;0: weight 1 plus a little random noise
   * @param oldLexicon  lexicon used for word scores in the EM weighting
   *                    (only consulted when randomness == -1)
   * @param secondHalf  unused in this implementation
   * @param noSmoothing forwarded to oldLexicon.score(...)
   */
  public void trainTree(Tree<StateSet> trainTree, double randomness, Lexicon oldLexicon,
  		boolean secondHalf, boolean noSmoothing) {
  	// scan data
    //for all substates that the word's preterminal tag has
    double sentenceScore = 0;
    if (randomness == -1){
      sentenceScore = trainTree.getLabel().getIScore(0);
  		if (sentenceScore==0){
  			System.out.println("Something is wrong with this tree. I will skip it.");
  			return;
  		}
    }
    int sentenceScale = trainTree.getLabel().getIScale();
  	
  	List<StateSet> words = trainTree.getYield();
  	List<StateSet> tags = trainTree.getPreTerminalYield();
  	//for all words in sentence
  	for (int position = 0; position < words.size(); position++) {
  		
  		int nSubStates = tags.get(position).numSubStates();
  		short tag = tags.get(position).getState();
  		
  		String word = words.get(position).getWord();
  		int globalWordIndex = wordIndexer.indexOf(word);
  		// NOTE(review): if this word was never seen under this tag during init(),
  		// tagSpecificWordIndex is -1 and the array write below would throw —
  		// presumably trainTree is always one of the trees passed to init(); confirm.
  		int tagSpecificWordIndex = tagWordIndexer[tag].indexOf(globalWordIndex);
  		
  		double[] oldLexiconScores = null;
  		if (randomness==-1)
  			oldLexiconScores = oldLexicon.score(word,tag,position,noSmoothing,false);
  		
  		StateSet currentState = tags.get(position);
      double scale = ScalingTools.calcScaleFactor(currentState.getOScale()-sentenceScale) / sentenceScore;
  		
      for (short substate=0; substate<nSubStates; substate++) {
        double weight = 1;
        if (randomness == -1) {
          //weight by the probability of seeing the tag and word together, given the sentence
          if (!Double.isInfinite(scale))
            weight = currentState.getOScore(substate) * oldLexiconScores[substate] * scale;
          else
          	// scale factor over/underflowed: redo the computation in log space
          	weight = Math.exp(Math.log(ScalingTools.SCALE)
								* (currentState.getOScale() - sentenceScale)
								- Math.log(sentenceScore)
								+ Math.log(currentState.getOScore(substate))
								+ Math.log(oldLexiconScores[substate]));
        }
        else if (randomness==0){
          // for the baseline
          weight = 1;
        }
        else {
          //add a bit of randomness
          weight = GrammarTrainer.RANDOM.nextDouble()*randomness/100.0+1.0;
        }
        if (weight==0)
        	continue;
        //tally in the tag with the given weight
        
	  		expectedCounts[tag][substate][tagSpecificWordIndex] += weight;
      }
  	}
	  		
  }

  /** Sets the variational-DP flag. NOTE(review): the flag is never read
   *  within this class — confirm it is consumed by external code. */
  public void setUseVarDP(boolean useVarDP) {
		this.useVarDP = useVarDP;
	}

  /*
   * assume that rare words have been replaced by their signature
   */
  public void init(StateSetTreeList trainTrees){
  	// Pass 1: build the global word indexer from every token in the training set.
  	for (Tree<StateSet> tree : trainTrees){
  		List<StateSet> words = tree.getYield();
  		for (StateSet word : words){
			wordIndexer.add(word.getWord());
  		}
  	}
  	tagWordIndexer = new IntegerIndexer[numStates];
  	for (int tag=0; tag<numStates; tag++){
  		tagWordIndexer[tag] = new IntegerIndexer(wordIndexer.size());
  	}
  	wordCounter = new int[wordIndexer.size()];
  	// Pass 2: count word occurrences and register each word with the
  	// tag-specific indexer of every preterminal tag it appears under.
  	for (Tree<StateSet> tree : trainTrees){
  		List<StateSet> tags = tree.getPreTerminalYield();
  		List<StateSet> words = tree.getYield();
  		int ind = 0;
  		for (StateSet word : words){
  			// hoisted: the global index was previously looked up twice per token
  			int wordIndex = wordIndexer.indexOf(word.getWord());
  			wordCounter[wordIndex]++;
  			tagWordIndexer[tags.get(ind).getState()].add(wordIndex);
  			ind++;
  		}  		
  	}
  	// Allocate the (tag, substate, tag-specific word) count and score tables.
  	expectedCounts = new double[numStates][][];
  	scores = new double[numStates][][];
  	for (int tag=0; tag<numStates; tag++){
  		expectedCounts[tag] = new double[numSubStates[tag]][tagWordIndexer[tag].size()];
  		scores[tag] = new double[numSubStates[tag]][tagWordIndexer[tag].size()];
  	}
  	nWords = wordIndexer.size();
  	labelTrees(trainTrees);
  }
  
  
  
  /**
   * Returns a copy of this lexicon: scores, word counts and per-tag indexers
   * are deep-copied; expectedCounts are freshly allocated (zeroed) rather
   * than copied; the global wordIndexer and the smoother are shared.
   */
  public SimpleLexicon copyLexicon(){
  	SimpleLexicon result = new SimpleLexicon(numSubStates, threshold);
  	result.wordIndexer = this.wordIndexer;
  	result.nWords = this.nWords;
  	result.smoother = this.smoother;
  	result.wordCounter = this.wordCounter.clone();
  	result.scores = ArrayUtil.clone(scores);
  	result.expectedCounts = new double[numStates][][];
  	result.tagWordIndexer = new IntegerIndexer[numStates];
  	for (int tag = 0; tag < numStates; tag++){
  		result.tagWordIndexer[tag] = tagWordIndexer[tag].copy();
  		result.expectedCounts[tag] = new double[numSubStates[tag]][tagWordIndexer[tag].size()];
  	}
  	return result;
  }
  
	/** @return whether scores currently hold log probabilities (see logarithmMode()). */
	public boolean isLogarithmMode() {
		return isLogarithmMode;
	}
	
	/**
	 * Converts every score to log space in place. Idempotent: the flag guards
	 * against taking the log twice.
	 */
	public void logarithmMode() {
		if (isLogarithmMode) return;
		for (double[][] tagScores : scores) {
			for (double[] row : tagScores) {
				for (int k = 0; k < row.length; k++) {
					row[k] = Math.log(row[k]);
				}
			}
		}
		isLogarithmMode = true;
	}

	/**
	 * Split all substates in two, producing a new lexicon. The new Lexicon gives
	 * the same scores to words under both split versions of the tag. (Leon says:
	 * It may not be okay to use the same scores, but I think that symmetry is
	 * sufficiently broken in Grammar.splitAllStates to ignore the randomness
	 * here.)
	 * 
	 * @param counts
	 *          how often each state was seen (currently unused — the cap on the
	 *          number of substates below is commented out)
	 * @param moreSubstatesThanCounts currently unused, see above
	 * @param mode
	 *      0 or 1: previous value plus noise
	 *      2: just noise (for log-linear grammars with cascading regularization)
	 * @return a lexicon whose substate counts are doubled for every state
	 *         except state 0 (ROOT), which is never split
	 */
	@SuppressWarnings("unchecked")
	public SimpleLexicon splitAllStates(int[] counts, boolean moreSubstatesThanCounts, int mode) {
		SimpleLexicon splitLex = this.copyLexicon();
		
		short[] newNumSubStates = new short[numSubStates.length];
		newNumSubStates[0] = 1; // never split ROOT
		Random random = GrammarTrainer.RANDOM;
		for (short i = 1; i < numSubStates.length; i++) {
			// don't split a state into more substates than times it was actaully seen
//			if (!moreSubstatesThanCounts && numSubStates[i]>=counts[i]) { 
//				newNumSubStates[i]=numSubStates[i];
//				} 
//			else{
				newNumSubStates[i] = (short)(numSubStates[i] * 2);
//			}
		}
		splitLex.numSubStates = newNumSubStates;
		double[][][] newScores = new double[scores.length][][];
		double[][][] newExpCounts = new double[scores.length][][];
  	for (int tag=0; tag<expectedCounts.length; tag++){
  		int nTagWords = tagWordIndexer[tag].size();
//  		if (nWords==0) continue;
  		newScores[tag] = new double[newNumSubStates[tag]][nTagWords];
  		newExpCounts[tag] = new double[newNumSubStates[tag]][nTagWords];
  		for (int substate=0; substate<numSubStates[tag]; substate++){
  			for (int word=0; word<expectedCounts[tag][substate].length; word++){
	  			// copy each old substate's score to both of its children
	  			newScores[tag][2*substate][word] = newScores[tag][2*substate+1][word] = scores[tag][substate][word];
	  			if (mode==2)
	  				// mode 2: discard the old score and start from 1 + small noise
	  				newScores[tag][2*substate][word] = newScores[tag][2*substate+1][word] = 1.0+random.nextDouble()/100.0;
	  		}
  		}
  	}
  	splitLex.scores = newScores;
  	splitLex.expectedCounts = newExpCounts;
		return splitLex;
	}
	
  /**
   * This routine returns a String that is the "signature" of the class of a
   * word.
   * For example, it might represent whether it is a number or ends in -s.
   * The strings returned by convention match the pattern UNK-.* , which
   * is just assumed to not match any real word.
   * Behavior depends on the unknownLevel (-uwm flag) passed in to the class.
   * The recognized numbers are 1-5: 5 is fairly English-specific; 4, 3, and 2
   * look for various word features (digits, dashes, etc.) which are only
   * vaguely English-specific; 1 uses the last two characters combined with 
   * a simple classification by capitalization.
   *
   * @param word The word to make a signature for
   * @param loc  Its position in the sentence (mainly so sentence-initial
   *             capitalized words can be treated differently)
   * @return A String that is its signature (equivalence class)
   */
  public String getNewSignature(String word, int loc) {
    // StringBuilder rather than StringBuffer: the builder is method-local,
    // so StringBuffer's synchronization is pure overhead.
    StringBuilder sb = new StringBuilder("UNK");
    if (word.length() == 0) {
      // Robustness: every case below reads word.charAt(0); an empty token
      // previously threw StringIndexOutOfBoundsException.
      return sb.toString();
    }
    switch (unknownLevel) {

      case 5:
        {
          // Reformed Mar 2004 (cdm); hopefully much better now.
          // { -CAPS, -INITC ap, -LC lowercase, 0 } +
          // { -KNOWNLC, 0 } +          [only for INITC]
          // { -NUM, 0 } +
          // { -DASH, 0 } +
          // { -last lowered char(s) if known discriminating suffix, 0}
          int wlen = word.length();
          int numCaps = 0;
          boolean hasDigit = false;
          boolean hasDash = false;
          boolean hasLower = false;
          for (int i = 0; i < wlen; i++) {
            char ch = word.charAt(i);
            if (Character.isDigit(ch)) {
              hasDigit = true;
            } else if (ch == '-') {
              hasDash = true;
            } else if (Character.isLetter(ch)) {
              if (Character.isLowerCase(ch)) {
                hasLower = true;
              } else if (Character.isTitleCase(ch)) {
                hasLower = true;
                numCaps++;
              } else {
                numCaps++;
              }
            }
          }
          char ch0 = word.charAt(0);
          String lowered = word.toLowerCase();
          // capitalization class
          if (Character.isUpperCase(ch0) || Character.isTitleCase(ch0)) {
            if (loc == 0 && numCaps == 1) {
              sb.append("-INITC");
              if (isKnown(lowered)) {
                sb.append("-KNOWNLC");
              }
            } else {
              sb.append("-CAPS");
            }
          } else if (!Character.isLetter(ch0) && numCaps > 0) {
            sb.append("-CAPS");
          } else if (hasLower) { // (Character.isLowerCase(ch0)) {
            sb.append("-LC");
          }
          if (hasDigit) {
            sb.append("-NUM");
          }
          if (hasDash) {
            sb.append("-DASH");
          }
          // common discriminating suffixes
          if (lowered.endsWith("s") && wlen >= 3) {
            // here length 3, so you don't miss out on ones like 80s
            char ch2 = lowered.charAt(wlen - 2);
            // not -ess suffixes or greek/latin -us, -is
            if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') {
              sb.append("-s");
            }
          } else if (word.length() >= 5 && !hasDash && !(hasDigit && numCaps > 0)) {
            // don't do for very short words
            if (lowered.endsWith("ed")) {
              sb.append("-ed");
            } else if (lowered.endsWith("ing")) {
              sb.append("-ing");
            } else if (lowered.endsWith("ion")) {
              sb.append("-ion");
            } else if (lowered.endsWith("er")) {
              sb.append("-er");
            } else if (lowered.endsWith("est")) {
              sb.append("-est");
            } else if (lowered.endsWith("ly")) {
              sb.append("-ly");
            } else if (lowered.endsWith("ity")) {
              sb.append("-ity");
            } else if (lowered.endsWith("y")) {
              sb.append("-y");
            } else if (lowered.endsWith("al")) {
              sb.append("-al");
            }
          }
          break;
        }

      case 4:
        {
          boolean hasDigit = false;
          boolean hasNonDigit = false;
          boolean hasLetter = false;
          boolean hasLower = false;
          boolean hasDash = false;
          boolean hasPeriod = false;
          boolean hasComma = false;
          for (int i = 0; i < word.length(); i++) {
            char ch = word.charAt(i);
            if (Character.isDigit(ch)) {
              hasDigit = true;
            } else {
              hasNonDigit = true;
              if (Character.isLetter(ch)) {
                hasLetter = true;
                if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
                  hasLower = true;
                }
              } else {
                if (ch == '-') {
                  hasDash = true;
                } else if (ch == '.') {
                  hasPeriod = true;
                } else if (ch == ',') {
                  hasComma = true;
                }
              }
            }
          }
          // 6 way on letters
          if (Character.isUpperCase(word.charAt(0)) || Character.isTitleCase(word.charAt(0))) {
            if (!hasLower) {
              sb.append("-AC");
            } else if (loc == 0) {
              sb.append("-SC");
            } else {
              sb.append("-C");
            }
          } else if (hasLower) {
            sb.append("-L");
          } else if (hasLetter) {
            sb.append("-U");
          } else {
            // no letter
            sb.append("-S");
          }
          // 3 way on number
          if (hasDigit && !hasNonDigit) {
            sb.append("-N");
          } else if (hasDigit) {
            sb.append("-n");
          }
          // binary on period, dash, comma
          if (hasDash) {
            sb.append("-H");
          }
          if (hasPeriod) {
            sb.append("-P");
          }
          if (hasComma) {
            sb.append("-C");
          }
          if (word.length() > 3) {
            // don't do for very short words: "yes" isn't an "-es" word
            // try doing to lower for further densening and skipping digits
            char ch = word.charAt(word.length() - 1);
            if (Character.isLetter(ch)) {
              sb.append("-");
              sb.append(Character.toLowerCase(ch));
            }
          }
          break;
        }

      case 3:
        {
          // This basically works right, except note that 'S' is applied to all
          // capitalized letters in first word of sentence, not just first....
          sb.append("-");
          char lastClass = '-';  // i.e., nothing
          char newClass;
          int num = 0;
          for (int i = 0; i < word.length(); i++) {
            char ch = word.charAt(i);
            if (Character.isUpperCase(ch) || Character.isTitleCase(ch)) {
              if (loc == 0) {
                newClass = 'S';
              } else {
                newClass = 'L';
              }
            } else if (Character.isLetter(ch)) {
              newClass = 'l';
            } else if (Character.isDigit(ch)) {
              newClass = 'd';
            } else if (ch == '-') {
              newClass = 'h';
            } else if (ch == '.') {
              newClass = 'p';
            } else {
              newClass = 's';
            }
            if (newClass != lastClass) {
              lastClass = newClass;
              sb.append(lastClass);
              num = 1;
            } else {
              if (num < 2) {
                sb.append('+');
              }
              num++;
            }
          }
          if (word.length() > 3) {
            // don't do for very short words: "yes" isn't an "-es" word
            // try doing to lower for further densening and skipping digits
            char ch = Character.toLowerCase(word.charAt(word.length() - 1));
            sb.append('-');
            sb.append(ch);
          }
          break;
        }

      case 2:
        {
          // {-ALLC, -INIT, -UC, -LC, zero} +
          // {-DASH, zero} +
          // {-NUM, -DIG, zero} +
          // {lowerLastChar, zeroIfShort}
          boolean hasDigit = false;
          boolean hasNonDigit = false;
          boolean hasLower = false;
          for (int i = 0; i < word.length(); i++) {
            char ch = word.charAt(i);
            if (Character.isDigit(ch)) {
              hasDigit = true;
            } else {
              hasNonDigit = true;
              if (Character.isLetter(ch)) {
                if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
                  hasLower = true;
                }
              }
            }
          }
          if (Character.isUpperCase(word.charAt(0)) || Character.isTitleCase(word.charAt(0))) {
            if (!hasLower) {
              sb.append("-ALLC");
            } else if (loc == 0) {
              sb.append("-INIT");
            } else {
              sb.append("-UC");
            }
          } else if (hasLower) {   // if (Character.isLowerCase(word.charAt(0))) {
            sb.append("-LC");
          }
          // no suffix = no (lowercase) letters
          if (word.indexOf('-') >= 0) {
            sb.append("-DASH");
          }
          if (hasDigit) {
            if (!hasNonDigit) {
              sb.append("-NUM");
            } else {
              sb.append("-DIG");
            }
          } else if (word.length() > 3) {
            // don't do for very short words: "yes" isn't an "-es" word
            // try doing to lower for further densening and skipping digits
            char ch = word.charAt(word.length() - 1);
            sb.append(Character.toLowerCase(ch));
          }
          // no suffix = short non-number, non-alphabetic
          break;
        }

      default:
        // level 1: last two characters plus a coarse capitalization class
        sb.append("-");
        sb.append(word.substring(Math.max(word.length() - 2, 0), word.length()));
        sb.append("-");
        if (Character.isLowerCase(word.charAt(0))) {
          sb.append("LOWER");
        } else {
          if (Character.isUpperCase(word.charAt(0))) {
            if (loc == 0) {
              sb.append("INIT");
            } else {
              sb.append("UPPER");
            }
          } else {
            sb.append("OTHER");
          }
        }
    } // end switch (unknownLevel)
    // System.err.println("Summarized " + word + " to " + sb.toString());
    return sb.toString();
  } // end getSignature()

  
  /**
   * Dumps the lexicon as one line per observed (tag, word) pair:
   * "tag word  score_0 score_1 ...". Uses StringBuilder (local, no need for
   * StringBuffer's synchronization) and avoids string concatenation inside
   * the append calls; the produced text is unchanged.
   */
  public String toString() {
  	StringBuilder sb = new StringBuilder();
  	Numberer tagNumberer = Numberer.getGlobalNumberer("tags");
  	for (int tag = 0; tag < expectedCounts.length; tag++){
  		String tagS = (String) tagNumberer.object(tag);
  		if (tagWordIndexer[tag].size() == 0) continue;
  		for (int word = 0; word < scores[tag][0].length; word++){
  			sb.append(tagS).append(' ').append(wordIndexer.get(tagWordIndexer[tag].get(word))).append(' ');
  			for (int sub = 0; sub < numSubStates[tag]; sub++){
  				sb.append(' ').append(scores[tag][sub][word]);
  			}
  			sb.append('\n');
  		}
  	}
  	return sb.toString();
  }


   
  /**
	 * Whether this exact word form was seen in training, i.e. it has an entry
	 * in the global word indexer.
	 *
	 * @param word the (already lowercased, at the call site) word form
	 * @return true iff the word is in wordIndexer
	 */
	private boolean isKnown(String word) {
		return wordIndexer.indexOf(word) >= 0;
	}

	/**
   * Returns the unknown-word signature (equivalence class) of the given word
   * at the given sentence position. The most recent (word, position) lookup
   * is cached, since the same query tends to be repeated many times in a row.
   */
  public String getSignature(String word, int sentencePosition) {
  	if (word.equals(lastWordToSignaturize) && sentencePosition == lastSentencePosition) {
  		return lastSignature;
  	}
  	String signature = getNewSignature(word, sentencePosition);
  	lastWordToSignaturize = word;
  	lastSentencePosition = sentencePosition;
  	lastSignature = signature;
  	return signature;
  }
  
  /**
   * Merges the substate pairs flagged in mergeThesePairs, replacing each
   * merged pair's word scores by their mergeWeights-weighted average.
   * Rebuilds this.scores with the merged substate counts and re-allocates
   * expectedCounts (zeroed) to the new sizes.
   *
   * @param mergeThesePairs per tag, which substate pairs to merge
   * @param mergeWeights    [tag][substate] weights for the weighted average
   */
  public void mergeStates(boolean[][][] mergeThesePairs, double[][] mergeWeights) {
		short[] newNumSubStates = new short[numSubStates.length];
		short[][] mapping = new short[numSubStates.length][];
		//invariant: if partners[state][substate][0] == substate, it's the 1st one
		short[][][] partners = new short[numSubStates.length][][];
		Grammar.calculateMergeArrays(mergeThesePairs,newNumSubStates,mapping,partners,numSubStates);

		double[][][] newScores = new double[scores.length][][];
		for (int tag=0; tag<expectedCounts.length; tag++){
  		int nTagWords = tagWordIndexer[tag].size();
  		newScores[tag] = new double[newNumSubStates[tag]][nTagWords];
  		if (numSubStates[tag]==1) continue;
			for (int word=0; word<expectedCounts[tag][0].length; word++){
				for (int i=0; i<numSubStates[tag]; i=i+2) {
					int nSplit=partners[tag][i].length;
					if (nSplit==2) {
						// merged pair: weighted average of the two partners' scores
						double mergeWeightSum = mergeWeights[tag][partners[tag][i][0]] + mergeWeights[tag][partners[tag][i][1]];
						if (mergeWeightSum==0) mergeWeightSum = 1;
						newScores[tag][mapping[tag][i]][word] = 
							((mergeWeights[tag][partners[tag][i][0]] * scores[tag][partners[tag][i][0]][word])+
							 (mergeWeights[tag][partners[tag][i][1]] * scores[tag][partners[tag][i][1]][word])) / mergeWeightSum;
					} else {
						// not merged: both substates keep their old scores
						newScores[tag][mapping[tag][i]][word] = scores[tag][i][word];
						newScores[tag][mapping[tag][i+1]][word] = scores[tag][i+1][word];
					}
				}
			}
  	}
  	this.numSubStates = newNumSubStates;
  	this.scores = newScores;
  	for (int tag=0; tag<numStates; tag++){
  		this.expectedCounts[tag] = new double[newNumSubStates[tag]][tagWordIndexer[tag].size()];
  	}
  }
  
  
  /**
   * Raises every score to the given power. The threshold-based pruning this
   * method is named for is currently disabled (it was commented out in the
   * original), so the threshold parameter is unused.
   */
  public void removeUnlikelyTags(double threshold, double exponent){
  	if (exponent == 1.0) return; // nothing to change
  	for (double[][] tagScores : scores){
  		for (double[] row : tagScores){
  			for (int k = 0; k < row.length; k++){
  				row[k] = Math.pow(row[k], exponent);
  			}
  		}
  	}
  }
  
  
//  public void logarithmMode() {
//  	logarithmMode = true;
//  }
//
//	public boolean isLogarithmMode() {
//		return logarithmMode;
//	}

	/**
	 * Projects this lexicon onto a coarser substate set: each fine substate's
	 * score, weighted by its conditional probability, is accumulated into the
	 * coarse substate it maps to.
	 *
	 * @param condProbs         conditional probability of each fine substate,
	 *                          indexed via mapping
	 * @param mapping           [state][substate] -> index into condProbs
	 * @param toSubstateMapping [state][0] = number of coarse substates;
	 *                          [state][substate+1] = coarse substate index
	 */
	public SimpleLexicon projectLexicon(double[] condProbs, int[][] mapping, int[][] toSubstateMapping) {
		short[] newNumSubStates = new short[numSubStates.length];
		for (int state=0; state<numSubStates.length; state++){
			newNumSubStates[state] = (short)toSubstateMapping[state][0];
		}
		SimpleLexicon newLexicon = this.copyLexicon();

		double[][][] newScores = new double[scores.length][][];
		
		
  	for (short tag=0; tag<expectedCounts.length; tag++){
  		newScores[tag] = new double[newNumSubStates[tag]][expectedCounts[tag][0].length];
  		for (int word=0; word<expectedCounts[tag][0].length; word++){
  			for (int substate=0; substate<numSubStates[tag]; substate++){
  				newScores[tag][toSubstateMapping[tag][substate+1]][word] += 
  					condProbs[mapping[tag][substate]]*scores[tag][substate][word];
  			}
			}
		}
  	newLexicon.numSubStates = newNumSubStates;
  	newLexicon.scores = newScores;
		return newLexicon;
	}
	

	/** @return the smoother installed via setSmoother (may be null). */
	public Smoother getSmoother() {
		return smoother;
	}

	/* (non-Javadoc)
	 * @see edu.berkeley.nlp.HDPPCFG.LexiconInterface#getSmoothingParams()
	 */
	public double[] getSmoothingParams() {
		// Not supported by this lexicon implementation; callers must handle null.
		return null;
	}

	/* (non-Javadoc)
	 * @see edu.berkeley.nlp.HDPPCFG.LexiconInterface#logarithmMode()
	 */

	/** Installs the smoother used by optimize() and score(). */
	public void setSmoother(Smoother smoother) {
		this.smoother = smoother;
	}

	/** @return the pruning threshold passed to the constructor. */
	public double getPruningThreshold() {
		return threshold;
	}

	/* (non-Javadoc)
	 * @see edu.berkeley.nlp.PCFGLA.Lexicon#scoreSignature(java.lang.String, short, int)
	 */
	public double[] scoreSignature(StateSet stateSet, int tag) {
		// Not implemented here: signature handling is folded into score(...).
		return null;
	}

	/* (non-Javadoc)
	 * @see edu.berkeley.nlp.PCFGLA.Lexicon#scoreWord(java.lang.String, short)
	 */
	public double[] scoreWord(StateSet stateSet, int tag) {
		// Not implemented here; use score(...) instead.
		return null;
	}

	/**
	 * Caches each token's global word index on its StateSet and marks its
	 * signature index as not-applicable (-1); score(...) recomputes the
	 * signature lazily when the word turns out to be unknown.
	 */
	public void labelTrees(StateSetTreeList trainTrees){
		for (Tree<StateSet> tree : trainTrees){
			for (StateSet token : tree.getYield()){
				token.wordIndex = wordIndexer.indexOf(token.getWord());
				token.sigIndex = -1;
			}
		}
	}


	/*
	public void clearMapping() {
		toBeIgnored = null;
		linearIndex = null;
	}

*/
	/**
	 * A bidirectional map between "global" integer ids in [0, capacity) and a
	 * dense range of "local" ids (0..size-1) assigned in insertion order.
	 * Backed by two flat arrays; -1 marks an unmapped slot. Field names are
	 * kept unchanged so the default serialVersionUID of saved lexicons is
	 * unaffected.
	 */
	public class IntegerIndexer implements Serializable{
		private int[] indexTo;   // global id -> local id, or -1
		private int[] indexFrom; // local id -> global id, or -1
		private int n;           // number of ids added so far

		IntegerIndexer(int capacity){
			n = 0;
			indexTo = new int[capacity];
			indexFrom = new int[capacity];
			Arrays.fill(indexTo, -1);
			Arrays.fill(indexFrom, -1);
		}

		/** Registers global id i; no-op if i == -1 or i is already present. */
		public void add(int i){
			if (i == -1 || indexTo[i] != -1) return;
			indexTo[i] = n;
			indexFrom[n] = i;
			n++;
		}

		/** @return the global id at local index i, or -1 if out of range. */
		public int get(int i){
			return (i < indexFrom.length) ? indexFrom[i] : -1;
		}

		/** @return the local index of global id i, or -1 if unmapped/out of range. */
		public int indexOf(int i){
			return (i < indexTo.length) ? indexTo[i] : -1;
		}

		/** @return how many ids have been added. */
		public int size(){
			return n;
		}

		/** @return an independent deep copy with the same capacity and contents. */
		public IntegerIndexer copy(){
			IntegerIndexer other = new IntegerIndexer(indexFrom.length);
			other.indexTo = this.indexTo.clone();
			other.indexFrom = this.indexFrom.clone();
			other.n = this.n;
			return other;
		}
	}


	/* (non-Javadoc)
	 * @see edu.berkeley.nlp.PCFGLA.Lexicon#computeScores()
	 */
	public void explicitlyComputeScores(int finalLevel) {
		// No-op in this implementation: scores are computed in optimize().
		
	}
	
}
